This notebook will likely be broken into at least two
#| default_exp core
import os
import numpy as np
import pandas as pd
import plotly.express as px
# import hilbertcurve
# from hilbertcurve.hilbertcurve import HilbertCurve
import tqdm
from tqdm import tqdm
import dlgwas
from dlgwas.dna import *
# ! conda install openpyxl -y
# ! conda install hilbertcurve -y
# ! pip install hilbertcurve
For this I'm using data referenced in Wallace et al 2014 which is available through panzea. This study refers to data from 9 studies (including itself) as a source of phenotypes for the NAM data. This combination of a large set of published GWAS hits, phenotypes, and many rils makes it ideal for use here.
This file contains results I can use to check if my approaches are producing similar hits.
Wallace_etal_2014_PLoSGenet_GWAS_hits = pd.read_table('../ext_data/zma/panzea/GWASResults/Wallace_etal_2014_PLoSGenet_GWAS_hits-150112.txt')
Wallace_etal_2014_PLoSGenet_GWAS_hits.head()
| trait | chr | pos | allele | rmip | source | |
|---|---|---|---|---|---|---|
| 0 | 100 Kernel weight | 1 | 3364007 | A/G | 1 | Hapmap1 |
| 1 | 100 Kernel weight | 1 | 22247033 | A/G | 3 | Hapmap1 |
| 2 | 100 Kernel weight | 1 | 22987420 | C/T | 1 | Hapmap2 |
| 3 | 100 Kernel weight | 1 | 23056483 | C/G | 1 | Hapmap2 |
| 4 | 100 Kernel weight | 1 | 23066099 | A/G | 1 | Hapmap2 |
This file on I think matches Buckler et al 2009.
temp = pd.read_excel('../ext_data/zma/panzea/GWASResults/JointGLMModels090324QTLLocations.xlsx',
skiprows=1
).rename(columns = {
'Unnamed: 1': 'ASI',
'Unnamed: 2': 'Days to Anthesis',
'Unnamed: 3': 'Days to Silk'
})
temp = temp.loc[temp.index == 0]
temp.head()
/home/labmember/mambaforge/envs/pytorch_mamba/lib/python3.10/site-packages/openpyxl/worksheet/_read_only.py:79: UserWarning: Unknown extension is not supported and will be removed for idx, row in parser.parse():
| Heritability of line BLUPs across all crosses (excluding association panel): | ASI | Days to Anthesis | Days to Silk | |
|---|---|---|---|---|
| 0 | Heritability of line BLUPs | 0.775037 | 0.944619 | 0.942365 |
temp = pd.read_excel('../ext_data/zma/panzea/GWASResults/JointGLMModels090324QTLLocations.xlsx',
skiprows=4
).rename(columns = {
'Unnamed: 1': 'ASI',
'Unnamed: 2': 'Days to Anthesis',
'Unnamed: 3': 'Days to Silk'
})
temp.head()
/home/labmember/mambaforge/envs/pytorch_mamba/lib/python3.10/site-packages/openpyxl/worksheet/_read_only.py:79: UserWarning: Unknown extension is not supported and will be removed for idx, row in parser.parse():
| Heritability of line BLUPs within each cross: | ASI | Days to Anthesis | Days to Silk | |
|---|---|---|---|---|
| 0 | B73×B97 | 0.64151 | 0.770513 | 0.766082 |
| 1 | B73×CML103 | 0.613736 | 0.777381 | 0.734942 |
| 2 | B73×CML228 | 0.680939 | 0.903684 | 0.892442 |
| 3 | B73×CML247 | 0.748584 | 0.900707 | 0.894291 |
| 4 | B73×CML277 | 0.708526 | 0.918574 | 0.911493 |
# pull in some of the data that Wallace et al 2014 point to:
buckler_2009_path = '../ext_data/zma/panzea/phenotypes/Buckler_etal_2009_Science_flowering_time_data-090807/'
os.listdir(buckler_2009_path)
['NAM_DaysToTassel.txt', 'NAM_TasselingDate.txt', 'markergenotypes062508.txt', 'NAM_SilkingDate.txt', 'NAMSum0607FloweringTraitBLUPsAcross8Envs.txt', 'markers061208.txt', 'NAM_DaysToSilk.txt']
nam_dts = pd.read_table(buckler_2009_path+'NAM_DaysToSilk.txt', encoding="ISO-8859-1")
nam_dts_taxa = list(nam_dts.loc[:, 'accename'].drop_duplicates())
# Look for the right taxa
# genome_files.sort()
# genome_files[0:10]
# # I think we need to match everything before the __
# def find_AGPv4_genome(
# taxa, # should be the desired taxa or a regex fragment (stopping before the __). E.g. 'B73' or 'B\d+'
# **kwargs # optionally pass in a genome list (this allows for a different path or precomputing if we're finding a lot of genomes)
# ):
# if 'genome_files' not in kwargs.keys():
# import os
# genome_files = os.listdir(
# '../data/zma/panzea/genotypes/GBS/v27/ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/')
# else:
# genome_files = kwargs['genome_files']
# import re
# return( [e for e in genome_files if re.match(taxa+'__.+', e)] )
genome_files = os.listdir(
'../data/zma/panzea/genotypes/GBS/v27/ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/')
possible_matches = [{'taxa': e,
'matches': find_AGPv4(
taxa = e,
genome_files = genome_files
)} for e in tqdm(nam_dts_taxa)]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5369/5369 [01:25<00:00, 63.16it/s]
# how many have more than one match?
len(
[[len(e['matches']), e] for e in possible_matches if len(e['matches']) != 1])
1297
'Z018E0021'
# ith_taxa = '05-397:250007467'
# res = get_AGPv4(taxa_to_filename(taxa = ith_taxa)) # Retrieve record
# temp = AGPv4_site.loc[:, ['Chromosome', 'Position']]
# temp[res[0]] = res[1:] # Add Col. with Nucleotides
# temp.head()
'Z018E0021'
pd.read_table(buckler_2009_path+'NAMSum0607FloweringTraitBLUPsAcross8Envs.txt', encoding="ISO-8859-1")
| Geno_Code | Entry_ID | Group | pop | entry | Days_To_Anthesis_BLUP_Sum0607 | Days_To_Silk_BLUP_Sum0607 | ASI_BLUP_Sum0607 | |
|---|---|---|---|---|---|---|---|---|
| 0 | Z001E0001 | 04P1367A51A | Z001 | 1 | 1 | 75.5364 | 77.1298 | 1.4600 |
| 1 | Z001E0002 | 04P1368A51A | Z001 | 1 | 2 | 76.9075 | 77.7945 | 1.3928 |
| 2 | Z001E0003 | 04P1368B51A | Z001 | 1 | 3 | 75.2646 | 75.2555 | 0.8644 |
| 3 | Z001E0004 | 04P1370B51A | Z001 | 1 | 4 | 73.6933 | 75.7604 | 2.0012 |
| 4 | Z001E0005 | 04P1371B51A | Z001 | 1 | 5 | 79.2441 | 81.2611 | 1.8931 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5458 | Z027E0277 | W64A | NaN | 27 | 277 | 71.9008 | 73.9811 | 2.6756 |
| 5459 | Z027E0278 | WD | NaN | 27 | 278 | 62.0212 | 60.5992 | -0.5733 |
| 5460 | Z027E0279 | Wf9 | NaN | 27 | 279 | 71.9970 | 72.2319 | 0.8338 |
| 5461 | Z027E0280 | Yu796_NS | NaN | 27 | 280 | 74.5107 | 73.9727 | 0.2935 |
| 5462 | Z027E0282 | Mo17 | NaN | 27 | 282 | 72.7428 | 75.5080 | 3.0455 |
5463 rows × 8 columns
pd.read_table(buckler_2009_path+'NAM_TasselingDate.txt', encoding="ISO-8859-1")
| loc_name | year | season | pop | block | plot | row | plot_id | entry_num | pedigree | seed_source | accename | value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Urbana, IL | 2006 | Summer | 18 | 1 | 1 | 1 | 0650001 | 21 | (B73×Mo18W)S5 | 05CL1572-blk | Z018E0021 | 07/28/06 |
| 1 | Urbana, IL | 2006 | Summer | 18 | 1 | 2 | 2 | 0650002 | 41 | (B73×Mo18W)S5 | 05FL4779-blk | Z018E0041 | 07/31/06 |
| 2 | Urbana, IL | 2006 | Summer | 18 | 1 | 3 | 3 | 0650003 | 128 | (B73×Mo18W)S5 | 24MM1520 | Z018E0128 | 07/31/06 |
| 3 | Urbana, IL | 2006 | Summer | 18 | 1 | 4 | 4 | 0650004 | 146 | (B73×Mo18W)S5 | 05P114751A | Z018E0146 | 07/31/06 |
| 4 | Urbana, IL | 2006 | Summer | 18 | 1 | 5 | 5 | 0650005 | 105 | (B73×Mo18W)S5 | 24MM1482 | Z018E0105 | 08/02/06 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 43454 | Columbia, MO | 2007 | Summer | 12 | 3 | 16 | 2936 | 27M32936 | 179 | (B73×Ki11)S5 | 05P025251A | Z012E0179 | 7/25/2007 |
| 43455 | Columbia, MO | 2007 | Summer | 12 | 3 | 17 | 2937 | 27M32937 | 182 | (B73×Ki11)S5 | 05P025351A | Z012E0182 | 7/23/2007 |
| 43456 | Columbia, MO | 2007 | Summer | 12 | 3 | 18 | 2938 | 27M32938 | 180 | (B73×Ki11)S5 | 05P029351A | Z012E0180 | 7/23/2007 |
| 43457 | Columbia, MO | 2007 | Summer | 12 | 3 | 19 | 2939 | 27M32939 | 168 | (B73×Ki11)S5 | 05P030451A | Z012E0168 | 7/25/2007 |
| 43458 | Columbia, MO | 2007 | Summer | 12 | 3 | 20 | 2940 | 27M32940 | 8 | (B73×Ki11)S5 | 05CL1149-blk | Z012E0008 | 7/21/2007 |
43459 rows × 13 columns
# pd.read_table(buckler_2009_path+'markergenotypes062508.txt', encoding="ISO-8859-1")
pd.read_table(buckler_2009_path+'NAMSum0607FloweringTraitBLUPsAcross8Envs.txt', encoding="ISO-8859-1")
| Geno_Code | Entry_ID | Group | pop | entry | Days_To_Anthesis_BLUP_Sum0607 | Days_To_Silk_BLUP_Sum0607 | ASI_BLUP_Sum0607 | |
|---|---|---|---|---|---|---|---|---|
| 0 | Z001E0001 | 04P1367A51A | Z001 | 1 | 1 | 75.5364 | 77.1298 | 1.4600 |
| 1 | Z001E0002 | 04P1368A51A | Z001 | 1 | 2 | 76.9075 | 77.7945 | 1.3928 |
| 2 | Z001E0003 | 04P1368B51A | Z001 | 1 | 3 | 75.2646 | 75.2555 | 0.8644 |
| 3 | Z001E0004 | 04P1370B51A | Z001 | 1 | 4 | 73.6933 | 75.7604 | 2.0012 |
| 4 | Z001E0005 | 04P1371B51A | Z001 | 1 | 5 | 79.2441 | 81.2611 | 1.8931 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5458 | Z027E0277 | W64A | NaN | 27 | 277 | 71.9008 | 73.9811 | 2.6756 |
| 5459 | Z027E0278 | WD | NaN | 27 | 278 | 62.0212 | 60.5992 | -0.5733 |
| 5460 | Z027E0279 | Wf9 | NaN | 27 | 279 | 71.9970 | 72.2319 | 0.8338 |
| 5461 | Z027E0280 | Yu796_NS | NaN | 27 | 280 | 74.5107 | 73.9727 | 0.2935 |
| 5462 | Z027E0282 | Mo17 | NaN | 27 | 282 | 72.7428 | 75.5080 | 3.0455 |
5463 rows × 8 columns
# os.listdir('../data/zma/panzea/genotypes/GBS/v27/ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/')
AGPv4_path = '../data/zma/panzea/genotypes/GBS/v27/'
# Other than listing the taxa this isn't expected to be of much use for our purposes.
AGPv4_taxa=pd.read_table(AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_TaxaList.txt')
AGPv4_taxa.head()
| Taxa | LibraryPrepID | Status | DNAPlate | GENUS | INBREEDF | SPECIES | DNASample | Flowcell_Lane | NumLanes | ... | GermplasmSet | Barcode | LibraryPlate | Tassel4SampleName | Population | LibraryPlateWell | SampleDNAWell | OwnerEmail | PEDIGREE | SeedLot | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 05-397:250007467 | 250007467 | public | P3-GS-F | Zea | 0.95 | mays | 05-397 | C00R8ABXX_4 | 1.0 | ... | Margaret Smith lines | GTTGAA | P3-GS-F | 05-397:C00R8ABXX:4:250007467 | Inbred | F11 | F11 | esb33@cornell.edu | NaN | NaN |
| 1 | 05-438:250007407 | 250007407 | public | P3-GS-F | Zea | 0.95 | mays | 05-438 | C00R8ABXX_4 | 1.0 | ... | Margaret Smith lines | GTATT | P3-GS-F | 05-438:C00R8ABXX:4:250007407 | Inbred | B03 | B03 | esb33@cornell.edu | NaN | NaN |
| 2 | 12E:250032344 | 250032344 | public | Ames12 | Zea | 0.95 | mays | 12E | 81FE8ABXX_4 | 1.0 | ... | Ames | GCTGTGGA | Ames12 | 12E:81FE8ABXX:4:250032344 | inbred | H08 | H08 | esb33@cornell.edu | 12E | NaN |
| 3 | 207:250007202 | 250007202 | public | P1-GS-F | Zea | 0.95 | mays | 207 | C00R8ABXX_2 | 1.0 | ... | Margaret Smith lines | TACAT | P1-GS-F | 207:C00R8ABXX:2:250007202 | Inbred | E12 | E12 | esb33@cornell.edu | NaN | NaN |
| 4 | 22612:250007466 | 250007466 | public | P3-GS-F | Zea | 0.95 | mays | 22612 | C00R8ABXX_4 | 1.0 | ... | Margaret Smith lines | GTACTT | P3-GS-F | 22612:C00R8ABXX:4:250007466 | Inbred | F10 | F10 | esb33@cornell.edu | NaN | NaN |
5 rows × 21 columns
# Useful for converting between the physical location and site
AGPv4_site = pd.read_table(AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_PositionList.txt')
AGPv4_site.head()
| Site | Name | Chromosome | Position | |
|---|---|---|---|---|
| 0 | 0 | S1_6370 | 1 | 52399 |
| 1 | 1 | S1_8210 | 1 | 54239 |
| 2 | 2 | S1_8376 | 1 | 54405 |
| 3 | 3 | S1_9889 | 1 | 55917 |
| 4 | 4 | S1_9899 | 1 | 55927 |
Retrieving a genome by taxa name:
# The genomes are in a folder with an identical name as their source table
table_directory = AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/'
# Note however that the naming is altered to not use ':'
os.listdir(table_directory)[0:3]
['xztea', 'xzedh', 'Z020E0024__250026132']
taxa_to_filename(taxa = '05-397:250007467')
'05-397__250007467'
def get_AGPv4(taxa):
with open(table_directory+taxa, 'r') as f:
data = f.read()
data = data.split('\t')
return(data)
get_AGPv4('05-397__250007467')[0:4]
['05-397:250007467', 'T', 'T', 'A']
In addition to returning a specific taxa, the table's headers can be retieved with "taxa".
get_AGPv4(taxa = 'taxa')[0:4]
['Taxa', '52399', '54239', '54405']
Converting between site and chromosome/position requires the AGPv4_site dataframe. A given record contains the taxa as well as the nucleotides, so with that entry excluded the chromosome / position can be paired up.
len(get_AGPv4(taxa = 'taxa')), AGPv4_site.shape
(943456, (943455, 4))
ith_taxa = '05-397:250007467'
res = get_AGPv4(taxa_to_filename(taxa = ith_taxa)) # Retrieve record
temp = AGPv4_site.loc[:, ['Chromosome', 'Position']]
temp[res[0]] = res[1:] # Add Col. with Nucleotides
temp.head()
| Chromosome | Position | 05-397:250007467 | |
|---|---|---|---|
| 0 | 1 | 52399 | T |
| 1 | 1 | 54239 | T |
| 2 | 1 | 54405 | A |
| 3 | 1 | 55917 | N |
| 4 | 1 | 55927 | N |
mask = (temp.Chromosome == 1)
temp_pos = temp.loc[mask, ['Position']]
temp_pos['Shift'] = 0
temp_pos.loc[1: , ['Shift']] = np.array(temp_pos.Position)[:-1]
temp_pos['Diff'] = temp_pos['Position'] - temp_pos['Shift']
temp_pos.loc[0, 'Diff'] = None
temp_pos
| Position | Shift | Diff | |
|---|---|---|---|
| 0 | 52399 | 0 | NaN |
| 1 | 54239 | 52399 | 1840.0 |
| 2 | 54405 | 54239 | 166.0 |
| 3 | 55917 | 54405 | 1512.0 |
| 4 | 55927 | 55917 | 10.0 |
| ... | ... | ... | ... |
| 147145 | 306971046 | 306910117 | 60929.0 |
| 147146 | 306971061 | 306971046 | 15.0 |
| 147147 | 306971063 | 306971061 | 2.0 |
| 147148 | 306971073 | 306971063 | 10.0 |
| 147149 | 306971080 | 306971073 | 7.0 |
147150 rows × 3 columns
# px.histogram(temp_pos, x = 'Diff')
# demonstrating the hilbert curve
temp = np.linspace(1, 100, num= 50)
# px.scatter(x = temp, y = [0 for e in range(len(temp))], color = temp)
px.imshow(temp.reshape((1, temp.shape[0])))
# hilbert_curve = HilbertCurve(p = 10, # iterations i.e. hold 4^p positions
# n = 2 # dimensions
# )
# distances = list(range(len(temp)))
# points = hilbert_curve.points_from_distances(distances)
# # px.line(pd.DataFrame(points, columns = ['i', 'j']), x = 'i', y = 'j')
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[40], line 1 ----> 1 hilbert_curve = HilbertCurve(p = 10, # iterations i.e. hold 4^p positions 2 n = 2 # dimensions 3 ) 4 distances = list(range(len(temp))) 6 points = hilbert_curve.points_from_distances(distances) NameError: name 'HilbertCurve' is not defined
# dim_0 = np.max(np.array(points)[:, 0])+1 # add 1 to account for 0 indexing
# dim_1 = np.max(np.array(points)[:, 1])+1
# temp_mat = np.zeros(shape = [dim_0, dim_1])
# temp_mat[temp_mat == 0] = np.nan # empty values being used for visualization
# for i in range(len(temp)):
# # print(i)
# temp_mat[points[i][0], points[i][1]] = temp[i]
# # temp2 = pd.DataFrame(points, columns = ['i', 'j'])
# # temp2['value'] = temp
# # px.scatter(temp2, x = 'i', y = 'j', color = 'value')
# px.imshow(temp_mat)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[41], line 1 ----> 1 dim_0 = np.max(np.array(points)[:, 0])+1 # add 1 to account for 0 indexing 2 dim_1 = np.max(np.array(points)[:, 1])+1 3 temp_mat = np.zeros(shape = [dim_0, dim_1]) NameError: name 'points' is not defined
# # Data represented need not be continuous -- it need only have int positions
# # a sequence or a sequence with gaps can be encoded
# hilbert_curve = HilbertCurve(p = 10, # iterations i.e. hold 4^p positions
# n = 2 # dimensions
# )
# fake_dists = list(range(len(temp)))
# # Introdude a gap in the sequence
# fake_dists = [e if e>10 else e+5 for e in fake_dists]
# distances = fake_dists
# points = hilbert_curve.points_from_distances(distances)
# dim_0 = np.max(np.array(points)[:, 0])+1 # add 1 to account for 0 indexing
# dim_1 = np.max(np.array(points)[:, 1])+1
# temp_mat = np.zeros(shape = [dim_0, dim_1])
# temp_mat[temp_mat == 0] = np.nan # empty values being used for visualization
# for i in range(len(temp)):
# # print(i)
# temp_mat[points[i][0], points[i][1]] = temp[i]
# px.imshow(temp_mat)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[42], line 3 1 # Data represented need not be continuous -- it need only have int positions 2 # a sequence or a sequence with gaps can be encoded ----> 3 hilbert_curve = HilbertCurve(p = 10, # iterations i.e. hold 4^p positions 4 n = 2 # dimensions 5 ) 8 fake_dists = list(range(len(temp))) 9 # Introdude a gap in the sequence NameError: name 'HilbertCurve' is not defined
# temp_pos['Present'] = 1
# temp_pos.shape[0]
147150
# def calc_needed_hilbert_p(n_needed = 1048576,
# max_p = 20):
# out = None
# for i in range(1, max_p):
# if 4**i > n_needed:
# out = i
# break
# return(out)
# calc_needed_hilbert_p(n_needed=147150)
9
# temp_pos['Position']
0 52399
1 54239
2 54405
3 55917
4 55927
...
147145 306971046
147146 306971061
147147 306971063
147148 306971073
147149 306971080
Name: Position, Length: 147150, dtype: int64
# # Data represented need not be continuous -- it need only have int positions
# # a sequence or a sequence with gaps can be encoded
# hilbert_curve = HilbertCurve(p = 10, # iterations i.e. hold 4^p positions
# n = 2 # dimensions
# )
# fake_dists = list(range(len(temp)))
# # Introdude a gap in the sequence
# fake_dists = [e if e>10 else e+5 for e in fake_dists]
# distances = fake_dists
# points = hilbert_curve.points_from_distances(distances)
# dim_0 = np.max(np.array(points)[:, 0])+1 # add 1 to account for 0 indexing
# dim_1 = np.max(np.array(points)[:, 1])+1
# temp_mat = np.zeros(shape = [dim_0, dim_1])
# temp_mat[temp_mat == 0] = np.nan # empty values being used for visualization
# for i in range(len(temp)):
# # print(i)
# temp_mat[points[i][0], points[i][1]] = temp[i]
# px.imshow(temp_mat)